# Employee-attrition EDA: load the raw export (contains missing values)
# and inspect its structure before imputation.
import pandas as pd
import plotly
# NOTE(review): hard-coded absolute Windows path — consider a config/CLI arg.
df = pd.read_csv(("C:\\Users\\sasikumarchennova\\Documents\\Data Science\\Excel\\project\\Employee_Attrition_with_missing.csv"))
df.info()  # 1470 rows x 26 columns; mix of float64/object columns (output pasted below)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1470 entries, 0 to 1469 Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 employee_id 1470 non-null int64 1 Age 1470 non-null float64 2 BusinessTravel 1470 non-null object 3 DailyRate 1470 non-null float64 4 Department 1470 non-null object 5 DistanceFromHome 1470 non-null float64 6 Education 1470 non-null object 7 EducationField 1470 non-null object 8 EnvironmentSatisfaction 1470 non-null object 9 Gender 1470 non-null object 10 HourlyRate 1470 non-null float64 11 JobInvolvement 1470 non-null object 12 JobLevel 1470 non-null float64 13 JobRole 1470 non-null object 14 JobSatisfaction 1470 non-null object 15 MaritalStatus 1470 non-null object 16 MonthlyIncome 1470 non-null float64 17 MonthlyRate 1470 non-null float64 18 NumCompaniesWorked 1470 non-null float64 19 OverTime 1470 non-null object 20 PercentSalaryHike 1470 non-null float64 21 PerformanceRating 1470 non-null object 22 RelationshipSatisfaction 1470 non-null object 23 TrainingTimesLastYear 1470 non-null float64 24 WorkLifeBalance 1470 non-null object 25 Attrition 1470 non-null object dtypes: float64(10), int64(1), object(15) memory usage: 298.7+ KB
df
| employee_id | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | ... | MonthlyIncome | MonthlyRate | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | TrainingTimesLastYear | WorkLifeBalance | Attrition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1453 | 41.0 | Travel_Rarely | 1102.0 | Sales | 1.0 | 2College | Life Sciences | 2Medium | Female | ... | 5993.0 | 19479.0 | 8.0 | Yes | 11.0 | 3Excellent | 1Low | 0.0 | 1Bad | Yes |
| 1 | 1454 | 49.0 | Travel_Frequently | 279.0 | Research & Development | 8.0 | 1Below College | Life Sciences | 3High | Male | ... | 5130.0 | 24907.0 | 1.0 | No | 23.0 | 4Outstanding | 4Very High | 3.0 | 3Better | No |
| 2 | 1455 | 37.0 | Travel_Rarely | 1373.0 | Research & Development | 2.0 | 2College | Other | 4Ver High | Male | ... | 2090.0 | 2396.0 | 6.0 | Yes | 15.0 | 3Excellent | 2Medium | 3.0 | 3Better | Yes |
| 3 | 1456 | 33.0 | Travel_Frequently | 1392.0 | Research & Development | 3.0 | 4Master | Life Sciences | 4Ver High | Female | ... | 2909.0 | 23159.0 | 1.0 | Yes | 11.0 | 3Excellent | 3High | 3.0 | 3Better | No |
| 4 | 1457 | 27.0 | Travel_Rarely | 591.0 | Research & Development | 2.0 | 1Below College | Medical | 1Low | Male | ... | 3468.0 | 16632.0 | 9.0 | No | 12.0 | 3Excellent | 4Very High | 3.0 | 3Better | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 2918 | 36.0 | Travel_Frequently | 884.0 | Research & Development | 23.0 | 2College | Medical | 3High | Male | ... | 2571.0 | 12290.0 | 4.0 | No | 17.0 | 3Excellent | 3High | 3.0 | 3Better | No |
| 1466 | 2919 | 39.0 | Travel_Rarely | 613.0 | Research & Development | 6.0 | 1Below College | Medical | 4Ver High | Male | ... | 9991.0 | 21457.0 | 4.0 | No | 15.0 | 3Excellent | 1Low | 5.0 | 3Better | No |
| 1467 | 2920 | 27.0 | Travel_Rarely | 155.0 | Research & Development | 4.0 | 3Bachelor | Life Sciences | 2Medium | Male | ... | 6142.0 | 5174.0 | 1.0 | Yes | 20.0 | 4Outstanding | 2Medium | 0.0 | 3Better | No |
| 1468 | 2921 | 49.0 | Travel_Frequently | 1023.0 | Sales | 2.0 | 3Bachelor | Medical | 4Ver High | Male | ... | 5390.0 | 13243.0 | 2.0 | No | 14.0 | 3Excellent | 4Very High | 3.0 | 2Good | No |
| 1469 | 2922 | 34.0 | Travel_Rarely | 628.0 | Research & Development | 8.0 | 3Bachelor | Medical | 2Medium | Male | ... | 4404.0 | 10228.0 | 2.0 | No | 12.0 | 3Excellent | 1Low | 3.0 | 4Best | No |
1470 rows × 26 columns
df.columns
Index(['employee_id', 'Age', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'EducationField',
'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
'TrainingTimesLastYear', 'WorkLifeBalance', 'Attrition'],
dtype='object')
def fillna_mode(data, variable):
    """Fill missing values of one column with the column's mode, mutating `data`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to impute (modified in place).
    variable : str
        Name of the (categorical/object) column to fill.

    Returns
    -------
    pd.Series
        The imputed column.
    """
    # Assign the filled column back instead of Series.fillna(inplace=True):
    # in-place fill on a column selection is chained-assignment-prone and
    # deprecated in pandas >= 2.1 (a no-op under copy-on-write).
    data[variable] = data[variable].fillna(data[variable].mode()[0])
    return data[variable]
def fillna_mean(data, variable):
    """Fill missing values of a numeric column with its rounded mean and
    cast the column to int, mutating `data`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to impute (modified in place).
    variable : str
        Name of the numeric column to fill.

    Returns
    -------
    pd.Series
        The imputed integer column.
    """
    mean_value = int(round(data[variable].mean(), 0))
    # Fill and cast in one assignment; Series.fillna(inplace=True) on a
    # column selection is deprecated in pandas >= 2.1.
    data[variable] = data[variable].fillna(mean_value).astype(int)
    return data[variable]
# Impute every column and assemble the cleaned table, preserving the original
# column order.  Each column gets one of three treatments:
#   'asis'  - copied through untouched (the id column)
#   'mean'  - numeric, filled with the rounded mean (fillna_mean)
#   'mode'  - categorical, filled with the most frequent value (fillna_mode)
#   'coded' - mode-filled, then the leading digit code is stripped
#             (e.g. '3High' -> 'High')
_column_plan = [
    ('employee_id', 'asis'),
    ('Age', 'mean'),
    ('BusinessTravel', 'mode'),
    ('DailyRate', 'mean'),
    ('Department', 'mode'),
    ('DistanceFromHome', 'mean'),
    ('Education', 'coded'),
    ('EducationField', 'mode'),
    ('EnvironmentSatisfaction', 'coded'),
    ('Gender', 'mode'),
    ('HourlyRate', 'mean'),
    ('JobInvolvement', 'coded'),
    ('JobLevel', 'mean'),
    ('JobRole', 'mode'),
    ('JobSatisfaction', 'coded'),
    ('MaritalStatus', 'mode'),
    ('MonthlyIncome', 'mean'),
    ('MonthlyRate', 'mean'),
    ('NumCompaniesWorked', 'mean'),
    ('OverTime', 'mode'),
    ('PercentSalaryHike', 'mean'),
    ('PerformanceRating', 'coded'),
    ('RelationshipSatisfaction', 'coded'),
    ('TrainingTimesLastYear', 'mean'),
    ('WorkLifeBalance', 'coded'),
    ('Attrition', 'mode'),
]
_cleaned_parts = []
for _col, _kind in _column_plan:
    if _kind == 'asis':
        _cleaned_parts.append(df[_col])
    elif _kind == 'mean':
        _cleaned_parts.append(pd.DataFrame(fillna_mean(df, _col)))
    elif _kind == 'mode':
        _cleaned_parts.append(pd.DataFrame(fillna_mode(df, _col)))
    else:  # 'coded': mode-fill, then drop the leading digit
        _cleaned_parts.append(pd.DataFrame(fillna_mode(df, _col).str[1:]))
table = pd.concat(_cleaned_parts, axis=1)
table.columns
Index(['employee_id', 'Age', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'EducationField',
'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
'TrainingTimesLastYear', 'WorkLifeBalance', 'Attrition'],
dtype='object')
# Persist the imputed table, then reload it so `df` now refers to the cleaned
# data (round-tripping through CSV also normalises the float columns to int).
table.to_csv('C:\\Users\\sasikumarchennova\\Documents\\Data Science\\Python\\project\\cleaned_table.csv',index= False)
df = pd.read_csv(("C:\\Users\\sasikumarchennova\\Documents\\Data Science\\Python\\project\\cleaned_table.csv"))
df
| employee_id | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | ... | MonthlyIncome | MonthlyRate | NumCompaniesWorked | OverTime | PercentSalaryHike | PerformanceRating | RelationshipSatisfaction | TrainingTimesLastYear | WorkLifeBalance | Attrition | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1453 | 41 | Travel_Rarely | 1102 | Sales | 1 | College | Life Sciences | Medium | Female | ... | 5993 | 19479 | 8 | Yes | 11 | Excellent | Low | 0 | Bad | Yes |
| 1 | 1454 | 49 | Travel_Frequently | 279 | Research & Development | 8 | Below College | Life Sciences | High | Male | ... | 5130 | 24907 | 1 | No | 23 | Outstanding | Very High | 3 | Better | No |
| 2 | 1455 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | College | Other | Ver High | Male | ... | 2090 | 2396 | 6 | Yes | 15 | Excellent | Medium | 3 | Better | Yes |
| 3 | 1456 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | Master | Life Sciences | Ver High | Female | ... | 2909 | 23159 | 1 | Yes | 11 | Excellent | High | 3 | Better | No |
| 4 | 1457 | 27 | Travel_Rarely | 591 | Research & Development | 2 | Below College | Medical | Low | Male | ... | 3468 | 16632 | 9 | No | 12 | Excellent | Very High | 3 | Better | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 2918 | 36 | Travel_Frequently | 884 | Research & Development | 23 | College | Medical | High | Male | ... | 2571 | 12290 | 4 | No | 17 | Excellent | High | 3 | Better | No |
| 1466 | 2919 | 39 | Travel_Rarely | 613 | Research & Development | 6 | Below College | Medical | Ver High | Male | ... | 9991 | 21457 | 4 | No | 15 | Excellent | Low | 5 | Better | No |
| 1467 | 2920 | 27 | Travel_Rarely | 155 | Research & Development | 4 | Bachelor | Life Sciences | Medium | Male | ... | 6142 | 5174 | 1 | Yes | 20 | Outstanding | Medium | 0 | Better | No |
| 1468 | 2921 | 49 | Travel_Frequently | 1023 | Sales | 2 | Bachelor | Medical | Ver High | Male | ... | 5390 | 13243 | 2 | No | 14 | Excellent | Very High | 3 | Good | No |
| 1469 | 2922 | 34 | Travel_Rarely | 628 | Research & Development | 8 | Bachelor | Medical | Medium | Male | ... | 4404 | 10228 | 2 | No | 12 | Excellent | Low | 3 | Best | No |
1470 rows × 26 columns
df.columns
Index(['employee_id', 'Age', 'BusinessTravel', 'DailyRate', 'Department',
'DistanceFromHome', 'Education', 'EducationField',
'EnvironmentSatisfaction', 'Gender', 'HourlyRate', 'JobInvolvement',
'JobLevel', 'JobRole', 'JobSatisfaction', 'MaritalStatus',
'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked', 'OverTime',
'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
'TrainingTimesLastYear', 'WorkLifeBalance', 'Attrition'],
dtype='object')
import pandas as pd
import plotly.express as px
def univariate_table_diagram(df, variable, title):
    """Build a count/percentage frequency table for one column and show it
    as a plotly bar chart.

    Parameters
    ----------
    df : pd.DataFrame
    variable : str
        Column to summarise.
    title : str
        Chart title.

    Returns
    -------
    pd.DataFrame
        Columns: Category, Counts, Percentage, Percentage_with_sign.
    """
    # reset_index() already yields a DataFrame; the pd.DataFrame(...) wrapper
    # the original used was a redundant copy.
    table = df.groupby([variable]).size().reset_index()
    table.columns = ['Category', 'Counts']
    # Series.sum()/Series.round() keep the arithmetic vectorised instead of
    # routing through the Python builtins.
    table['Percentage'] = (table['Counts'] / table['Counts'].sum() * 100).round(0)
    table['Percentage_with_sign'] = table['Percentage'].astype(str) + "%"
    fig = px.bar(table,
                 x='Category',
                 y='Counts',
                 title=title,
                 text=table['Percentage_with_sign'])
    fig.update_layout(template='plotly_white', title_x=.5)
    fig.show()
    return table
pd.DataFrame(univariate_table_diagram(df,'BusinessTravel','Business Travel '))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Non-Travel | 148 | 10.0 | 10.0% |
| 1 | Travel_Frequently | 273 | 19.0 | 19.0% |
| 2 | Travel_Rarely | 1049 | 71.0 | 71.0% |
pd.DataFrame(univariate_table_diagram(df,'Department','Department'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Human Resources | 62 | 4.0 | 4.0% |
| 1 | Research & Development | 966 | 66.0 | 66.0% |
| 2 | Sales | 442 | 30.0 | 30.0% |
pd.DataFrame(univariate_table_diagram(df,'Education','Education'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Bachelor | 583 | 40.0 | 40.0% |
| 1 | Below College | 169 | 11.0 | 11.0% |
| 2 | College | 279 | 19.0 | 19.0% |
| 3 | Doctor | 48 | 3.0 | 3.0% |
| 4 | Master | 391 | 27.0 | 27.0% |
pd.DataFrame(univariate_table_diagram(df,'EducationField','Education Field'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Human Resources | 27 | 2.0 | 2.0% |
| 1 | Life Sciences | 613 | 42.0 | 42.0% |
| 2 | Marketing | 157 | 11.0 | 11.0% |
| 3 | Medical | 459 | 31.0 | 31.0% |
| 4 | Other | 82 | 6.0 | 6.0% |
| 5 | Technical Degree | 132 | 9.0 | 9.0% |
pd.DataFrame(univariate_table_diagram(df,'EnvironmentSatisfaction','Environment Satisfaction'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | High | 468 | 32.0 | 32.0% |
| 1 | Low | 279 | 19.0 | 19.0% |
| 2 | Medium | 282 | 19.0 | 19.0% |
| 3 | Ver High | 441 | 30.0 | 30.0% |
pd.DataFrame(univariate_table_diagram(df,'Gender','Gender'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Female | 583 | 40.0 | 40.0% |
| 1 | Male | 887 | 60.0 | 60.0% |
pd.DataFrame(univariate_table_diagram(df,'JobInvolvement','Job Involvement'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | High | 873 | 59.0 | 59.0% |
| 1 | Low | 81 | 6.0 | 6.0% |
| 2 | Medium | 372 | 25.0 | 25.0% |
| 3 | Very High | 144 | 10.0 | 10.0% |
pd.DataFrame(univariate_table_diagram(df,'JobRole','Job Role'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Healthcare Representative | 130 | 9.0 | 9.0% |
| 1 | Human Resources | 52 | 4.0 | 4.0% |
| 2 | Laboratory Technician | 256 | 17.0 | 17.0% |
| 3 | Manager | 102 | 7.0 | 7.0% |
| 4 | Manufacturing Director | 144 | 10.0 | 10.0% |
| 5 | Research Director | 79 | 5.0 | 5.0% |
| 6 | Research Scientist | 290 | 20.0 | 20.0% |
| 7 | Sales Executive | 335 | 23.0 | 23.0% |
| 8 | Sales Representative | 82 | 6.0 | 6.0% |
pd.DataFrame(univariate_table_diagram(df,'JobSatisfaction','Job Satisfaction'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | High | 440 | 30.0 | 30.0% |
| 1 | Low | 286 | 19.0 | 19.0% |
| 2 | Medium | 278 | 19.0 | 19.0% |
| 3 | Very High | 466 | 32.0 | 32.0% |
pd.DataFrame(univariate_table_diagram(df,'MaritalStatus','Marital Status'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Divorced | 326 | 22.0 | 22.0% |
| 1 | Married | 680 | 46.0 | 46.0% |
| 2 | Single | 464 | 32.0 | 32.0% |
pd.DataFrame(univariate_table_diagram(df,'PerformanceRating','Performance Rating'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Excellent | 1248 | 85.0 | 85.0% |
| 1 | Outstanding | 222 | 15.0 | 15.0% |
pd.DataFrame(univariate_table_diagram(df,'RelationshipSatisfaction','Relationship Satisfaction'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | High | 467 | 32.0 | 32.0% |
| 1 | Low | 274 | 19.0 | 19.0% |
| 2 | Medium | 301 | 20.0 | 20.0% |
| 3 | Very High | 428 | 29.0 | 29.0% |
pd.DataFrame(univariate_table_diagram(df,'WorkLifeBalance','Work Life Balance'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Bad | 80 | 5.0 | 5.0% |
| 1 | Best | 153 | 10.0 | 10.0% |
| 2 | Better | 896 | 61.0 | 61.0% |
| 3 | Good | 341 | 23.0 | 23.0% |
import pandas as pd
import plotly.express as px
def univariate_pie_chart(df, variable, title):
    """Summarise one column as counts and rounded percentages, render the
    counts as a plotly pie chart, and return the summary table
    (Category, Counts, Percentage, Percentage_with_sign)."""
    counts = df.groupby([variable]).size().reset_index()
    counts.columns = ['Category', 'Counts']
    total = sum(counts['Counts'])
    counts['Percentage'] = round(counts['Counts'] / total * 100, 0)
    counts['Percentage_with_sign'] = counts['Percentage'].astype(str) + "%"
    chart = px.pie(counts, values='Counts', names='Category', title=title)
    chart.update_layout(template='seaborn', title_x=0.5)
    chart.show()
    return counts
pd.DataFrame(univariate_pie_chart(df,'WorkLifeBalance','Work Life Balance'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Bad | 80 | 5.0 | 5.0% |
| 1 | Best | 153 | 10.0 | 10.0% |
| 2 | Better | 896 | 61.0 | 61.0% |
| 3 | Good | 341 | 23.0 | 23.0% |
pd.DataFrame(univariate_pie_chart(df,'Education','Education'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Bachelor | 583 | 40.0 | 40.0% |
| 1 | Below College | 169 | 11.0 | 11.0% |
| 2 | College | 279 | 19.0 | 19.0% |
| 3 | Doctor | 48 | 3.0 | 3.0% |
| 4 | Master | 391 | 27.0 | 27.0% |
pd.DataFrame(univariate_pie_chart(df,'MaritalStatus','Marital Status'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Divorced | 326 | 22.0 | 22.0% |
| 1 | Married | 680 | 46.0 | 46.0% |
| 2 | Single | 464 | 32.0 | 32.0% |
pd.DataFrame(univariate_pie_chart(df,'PerformanceRating','Performance Rating'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Excellent | 1248 | 85.0 | 85.0% |
| 1 | Outstanding | 222 | 15.0 | 15.0% |
pd.DataFrame(univariate_pie_chart(df,'JobRole','Job Role'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | Healthcare Representative | 130 | 9.0 | 9.0% |
| 1 | Human Resources | 52 | 4.0 | 4.0% |
| 2 | Laboratory Technician | 256 | 17.0 | 17.0% |
| 3 | Manager | 102 | 7.0 | 7.0% |
| 4 | Manufacturing Director | 144 | 10.0 | 10.0% |
| 5 | Research Director | 79 | 5.0 | 5.0% |
| 6 | Research Scientist | 290 | 20.0 | 20.0% |
| 7 | Sales Executive | 335 | 23.0 | 23.0% |
| 8 | Sales Representative | 82 | 6.0 | 6.0% |
def group(x):
    """Map an age to its age-band label.

    Bands: "< 18", "19-24" (also receives 18), "25-34", "35-44",
    "45-54", "55 +".

    Bug fix: the original used strict '<' upper bounds, so every boundary
    age was pushed into the next band (24 -> "25-34", 34 -> "35-44",
    44 -> "45-54", 54 -> "55 +").  Inclusive upper bounds now match the
    labels.
    """
    if x < 18:
        status = "< 18"
    elif x <= 24:   # note: age 18 lands here, as in the original
        status = "19-24"
    elif x <= 34:
        status = "25-34"
    elif x <= 44:
        status = "35-44"
    elif x <= 54:
        status = "45-54"
    else:
        status = "55 +"
    return status
df['Age_group'] = df['Age'].apply(group)  # derive the age-band label per employee
import pandas as pd
import plotly.express as px
def univariate_table_diagram(df, variable, title):
    """Frequency table (counts + rounded percentages) for one column,
    rendered as a colour-coded bar chart; returns the table
    (Category, Counts, Percentage, Percentage_with_sign).

    Redefinition of the earlier helper: bars are now coloured by category.
    """
    freq = df.groupby([variable]).size().reset_index()
    freq.columns = ['Category', 'Counts']
    share = freq['Counts'] / sum(freq['Counts']) * 100
    freq['Percentage'] = round(share, 0)
    freq['Percentage_with_sign'] = freq['Percentage'].astype(str) + "%"
    labels = freq['Percentage_with_sign'].astype(str)
    chart = px.bar(freq,
                   x='Category',
                   y='Counts',
                   title=title,
                   text=labels,
                   color='Category')
    chart.update_layout(template='plotly_white', title_x=0.5).show()
    return freq
pd.DataFrame(univariate_table_diagram(df,'Age_group','Age'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | 19-24 | 71 | 5.0 | 5.0% |
| 1 | 25-34 | 496 | 34.0 | 34.0% |
| 2 | 35-44 | 559 | 38.0 | 38.0% |
| 3 | 45-54 | 257 | 17.0 | 17.0% |
| 4 | 55 + | 87 | 6.0 | 6.0% |
import pandas as pd
import plotly.express as px
def univariate_table_diagram(df, variable, title):
    """Frequency table (counts + rounded percentages) for one column,
    rendered as a colour-coded bar chart; returns the table
    (Category, Counts, Percentage, Percentage_with_sign).

    Final redefinition used for the banded numeric columns below; when the
    column is an ordered Categorical the bars follow the declared band order.
    """
    summary = df.groupby([variable]).size().reset_index()
    summary.columns = ['Category', 'Counts']
    total = sum(summary['Counts'])
    summary['Percentage'] = round(summary['Counts'] / total * 100, 0)
    summary['Percentage_with_sign'] = summary['Percentage'].astype(str) + "%"
    figure = px.bar(summary,
                    x='Category',
                    y='Counts',
                    title=title,
                    text=summary['Percentage_with_sign'].astype(str),
                    color='Category')
    figure.update_layout(template='plotly_white', title_x=0.5).show()
    return summary
import numpy as np
import pandas as pd
def categorize_daily_rate(daily_rate):
    """Bucket a daily rate into the labelled bands used for plotting.

    Returns one of: "<200", "201-499", "500-897", "898-1296",
    "1297-1693", ">1694" (labels match the ordered pd.Categorical
    declared where this function is applied).

    Bug fix: the top boundary was 1697, so rates 1694-1697 were
    mislabelled "1297-1693"; the boundary now matches the band label.
    """
    if daily_rate <= 200:
        return "<200"
    elif daily_rate <= 499:
        return "201-499"
    elif daily_rate <= 897:
        return "500-897"
    elif daily_rate <= 1296:
        return "898-1296"
    elif daily_rate <= 1693:  # was 1697: mismatched the "1297-1693" label
        return "1297-1693"
    else:
        return ">1694"
# Bucket DailyRate, then impose the band order so plots sort by band
# rather than alphabetically.
df['DailyRate_Group'] = df['DailyRate'].apply(categorize_daily_rate)
df['DailyRate_Group'] = pd.Categorical(df['DailyRate_Group'], categories=['<200', '201-499', '500-897', '898-1296', '1297-1693', '>1694'], ordered=True)
pd.DataFrame(univariate_table_diagram(df,'DailyRate_Group','Daily Rate'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | <200 | 106 | 7.0 | 7.0% |
| 1 | 201-499 | 295 | 20.0 | 20.0% |
| 2 | 500-897 | 441 | 30.0 | 30.0% |
| 3 | 898-1296 | 406 | 28.0 | 28.0% |
| 4 | 1297-1693 | 221 | 15.0 | 15.0% |
| 5 | >1694 | 1 | 0.0 | 0.0% |
import numpy as np
import pandas as pd
def categorize_distance_f_h(dfh):
    """Return the distance-from-home band label for a distance in KM.

    Bands mirror the ordered pd.Categorical declared where this is applied.
    """
    bands = (
        (5, "<5 KM"),
        (10, "6-10 KM"),
        (15, "11-15 KM"),
        (20, "16-20 KM"),
        (25, "21-25 KM"),
    )
    for upper, label in bands:
        if dfh <= upper:
            return label
    return "25 + KM"
# Bucket DistanceFromHome (KM) and impose the band order for plotting.
df['Distance_Group'] = df['DistanceFromHome'].apply(categorize_distance_f_h)
df['Distance_Group'] = pd.Categorical(df['Distance_Group'], categories=['<5 KM', '6-10 KM', '11-15 KM', '16-20 KM', '21-25 KM', '25 + KM'], ordered=True)
pd.DataFrame(univariate_table_diagram(df,'Distance_Group','Distance From Home'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | <5 KM | 624 | 42.0 | 42.0% |
| 1 | 6-10 KM | 406 | 28.0 | 28.0% |
| 2 | 11-15 KM | 114 | 8.0 | 8.0% |
| 3 | 16-20 KM | 123 | 8.0 | 8.0% |
| 4 | 21-25 KM | 116 | 8.0 | 8.0% |
| 5 | 25 + KM | 87 | 6.0 | 6.0% |
import numpy as np
import pandas as pd
def categorize_mon_income(monthly_income):
    """Return the monthly-income band label.

    Bands mirror the ordered pd.Categorical declared where this is applied.
    NOTE(review): 11000 falls in the "6001 - 10999" band (the <= 11000
    bound does not quite match that label); kept as-is because the
    Categorical's category strings depend on these exact labels.
    """
    bands = (
        (6000, "< 6000 Monthly Income"),
        (11000, "6001 - 10999 Monthly Income"),
        (16000, "11001-16000 Monthly Income"),
        (21000, "16001-21000 Monthly Income"),
    )
    for upper, label in bands:
        if monthly_income <= upper:
            return label
    return "21000 + Monthly Income"
# Bucket MonthlyIncome and impose the band order for plotting.
df['MonthlyIncome_Group'] = df['MonthlyIncome'].apply(categorize_mon_income)
df['MonthlyIncome_Group'] = pd.Categorical(df['MonthlyIncome_Group'], categories=['< 6000 Monthly Income', '6001 - 10999 Monthly Income', '11001-16000 Monthly Income', '16001-21000 Monthly Income', '21000 + Monthly Income'], ordered=True)
pd.DataFrame(univariate_table_diagram(df,'MonthlyIncome_Group','Monthly Income'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | < 6000 Monthly Income | 906 | 62.0 | 62.0% |
| 1 | 6001 - 10999 Monthly Income | 349 | 24.0 | 24.0% |
| 2 | 11001-16000 Monthly Income | 89 | 6.0 | 6.0% |
| 3 | 16001-21000 Monthly Income | 126 | 9.0 | 9.0% |
| 4 | 21000 + Monthly Income | 0 | 0.0 | 0.0% |
import numpy as np
import pandas as pd
def categorize_mon_rate(monthly_rate):
    """Return the monthly-rate band label.

    Bands mirror the ordered pd.Categorical declared where this is applied.
    """
    bands = (
        (10000, "< 10000 Monthly Rate"),
        (18000, "10001 - 18000 Monthly Rate"),
        (26000, "18001-26000 Monthly Rate"),
        (34000, "26001-34000 Monthly Rate"),
        (58000, "34001 - 58000 Monthly Rate"),
    )
    for upper, label in bands:
        if monthly_rate <= upper:
            return label
    return "58001 + Monthly Rate"
# Bucket MonthlyRate and impose the band order for plotting.
df['MonthlyRate_Group'] = df['MonthlyRate'].apply(categorize_mon_rate)
df['MonthlyRate_Group'] = pd.Categorical(df['MonthlyRate_Group'], categories=['< 10000 Monthly Rate', '10001 - 18000 Monthly Rate', '18001-26000 Monthly Rate', '26001-34000 Monthly Rate', '34001 - 58000 Monthly Rate','58001 + Monthly Rate'], ordered=True)
pd.DataFrame(univariate_table_diagram(df,'MonthlyRate_Group','MonthlyRate Income'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | < 10000 Monthly Rate | 485 | 33.0 | 33.0% |
| 1 | 10001 - 18000 Monthly Rate | 474 | 32.0 | 32.0% |
| 2 | 18001-26000 Monthly Rate | 461 | 31.0 | 31.0% |
| 3 | 26001-34000 Monthly Rate | 49 | 3.0 | 3.0% |
| 4 | 34001 - 58000 Monthly Rate | 1 | 0.0 | 0.0% |
| 5 | 58001 + Monthly Rate | 0 | 0.0 | 0.0% |
import numpy as np
import pandas as pd
def categorize_company_work(work):
    """Return the band label for the number of companies worked at.

    Bands mirror the ordered pd.Categorical declared where this is applied.
    """
    bands = (
        (1, "< 1 Company"),
        (3, "2-3 Company"),
        (5, "4-5 Company"),
        (7, "6-7 Company"),
        (9, "8-9 Company"),
    )
    for upper, label in bands:
        if work <= upper:
            return label
    return "More than 9 Company"
# Bucket NumCompaniesWorked and impose the band order for plotting.
df['NumCompaniesWorked_Group'] = df['NumCompaniesWorked'].apply(categorize_company_work)
df['NumCompaniesWorked_Group'] = pd.Categorical(df['NumCompaniesWorked_Group'], categories=['< 1 Company', '2-3 Company', '4-5 Company', '6-7 Company', '8-9 Company','More than 9 Company'], ordered=True)
pd.DataFrame(univariate_table_diagram(df,'NumCompaniesWorked_Group','Num Companies Worked'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | < 1 Company | 711 | 48.0 | 48.0% |
| 1 | 2-3 Company | 316 | 21.0 | 21.0% |
| 2 | 4-5 Company | 201 | 14.0 | 14.0% |
| 3 | 6-7 Company | 142 | 10.0 | 10.0% |
| 4 | 8-9 Company | 100 | 7.0 | 7.0% |
| 5 | More than 9 Company | 0 | 0.0 | 0.0% |
import numpy as np
import pandas as pd
def categorize_salary_hike(hike):
    """Return the percent-salary-hike band label.

    Bands mirror the ordered pd.Categorical declared where this is applied.
    """
    bands = (
        (13, "< 13 percentage %"),
        (16, "14-16 percentage %"),
        (19, "17-19 percentage %"),
        (22, "20-22 percentage %"),
    )
    for upper, label in bands:
        if hike <= upper:
            return label
    return "22 + percentage %"
# Bucket PercentSalaryHike and impose the band order for plotting.
df['PercentSalaryHike_Group'] = df['PercentSalaryHike'].apply(categorize_salary_hike)
df['PercentSalaryHike_Group'] = pd.Categorical(df['PercentSalaryHike_Group'], categories=['< 13 percentage %', '14-16 percentage %', '17-19 percentage %', '20-22 percentage %', '22 + percentage %'], ordered=True)
pd.DataFrame(univariate_table_diagram(df,'PercentSalaryHike_Group','Percent Salary Hike'))
| Category | Counts | Percentage | Percentage_with_sign | |
|---|---|---|---|---|
| 0 | < 13 percentage % | 610 | 41.0 | 41.0% |
| 1 | 14-16 percentage % | 393 | 27.0 | 27.0% |
| 2 | 17-19 percentage % | 244 | 17.0 | 17.0% |
| 3 | 20-22 percentage % | 159 | 11.0 | 11.0% |
| 4 | 22 + percentage % | 64 | 4.0 | 4.0% |
df
| employee_id | Age | BusinessTravel | DailyRate | Department | DistanceFromHome | Education | EducationField | EnvironmentSatisfaction | Gender | ... | TrainingTimesLastYear | WorkLifeBalance | Attrition | Age_group | DailyRate_Group | Distance_Group | MonthlyIncome_Group | MonthlyRate_Group | NumCompaniesWorked_Group | PercentSalaryHike_Group | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1453 | 41 | Travel_Rarely | 1102 | Sales | 1 | College | Life Sciences | Medium | Female | ... | 0 | Bad | Yes | 35-44 | 898-1296 | <5 KM | < 6000 Monthly Income | 18001-26000 Monthly Rate | 8-9 Company | < 13 percentage % |
| 1 | 1454 | 49 | Travel_Frequently | 279 | Research & Development | 8 | Below College | Life Sciences | High | Male | ... | 3 | Better | No | 45-54 | 201-499 | 6-10 KM | < 6000 Monthly Income | 18001-26000 Monthly Rate | < 1 Company | 22 + percentage % |
| 2 | 1455 | 37 | Travel_Rarely | 1373 | Research & Development | 2 | College | Other | Ver High | Male | ... | 3 | Better | Yes | 35-44 | 1297-1693 | <5 KM | < 6000 Monthly Income | < 10000 Monthly Rate | 6-7 Company | 14-16 percentage % |
| 3 | 1456 | 33 | Travel_Frequently | 1392 | Research & Development | 3 | Master | Life Sciences | Ver High | Female | ... | 3 | Better | No | 25-34 | 1297-1693 | <5 KM | < 6000 Monthly Income | 18001-26000 Monthly Rate | < 1 Company | < 13 percentage % |
| 4 | 1457 | 27 | Travel_Rarely | 591 | Research & Development | 2 | Below College | Medical | Low | Male | ... | 3 | Better | No | 25-34 | 500-897 | <5 KM | < 6000 Monthly Income | 10001 - 18000 Monthly Rate | 8-9 Company | < 13 percentage % |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1465 | 2918 | 36 | Travel_Frequently | 884 | Research & Development | 23 | College | Medical | High | Male | ... | 3 | Better | No | 35-44 | 500-897 | 21-25 KM | < 6000 Monthly Income | 10001 - 18000 Monthly Rate | 4-5 Company | 17-19 percentage % |
| 1466 | 2919 | 39 | Travel_Rarely | 613 | Research & Development | 6 | Below College | Medical | Ver High | Male | ... | 5 | Better | No | 35-44 | 500-897 | 6-10 KM | 6001 - 10999 Monthly Income | 18001-26000 Monthly Rate | 4-5 Company | 14-16 percentage % |
| 1467 | 2920 | 27 | Travel_Rarely | 155 | Research & Development | 4 | Bachelor | Life Sciences | Medium | Male | ... | 0 | Better | No | 25-34 | <200 | <5 KM | 6001 - 10999 Monthly Income | < 10000 Monthly Rate | < 1 Company | 20-22 percentage % |
| 1468 | 2921 | 49 | Travel_Frequently | 1023 | Sales | 2 | Bachelor | Medical | Ver High | Male | ... | 3 | Good | No | 45-54 | 898-1296 | <5 KM | < 6000 Monthly Income | 10001 - 18000 Monthly Rate | 2-3 Company | 14-16 percentage % |
| 1469 | 2922 | 34 | Travel_Rarely | 628 | Research & Development | 8 | Bachelor | Medical | Medium | Male | ... | 3 | Best | No | 35-44 | 500-897 | 6-10 KM | < 6000 Monthly Income | 10001 - 18000 Monthly Rate | 2-3 Company | < 13 percentage % |
1470 rows × 33 columns
def bivariate_table(df, row, column):
table = df.groupby([row, column]).size().reset_index()
table['percentage'] = df.groupby([row, column]).size().groupby(level=0).apply(lambda x: 100 * x / float(x.sum())).values.round(1)
table.columns = [row, column, 'Counts', 'Percentage']
table['Percentage'] = table['Percentage'].astype(str) + '%'
return table
bivariate_table(df,'BusinessTravel','Attrition')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\89530821.py:3: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| BusinessTravel | Attrition | Counts | Percentage | |
|---|---|---|---|---|
| 0 | Non-Travel | No | 137 | 92.6% |
| 1 | Non-Travel | Yes | 11 | 7.4% |
| 2 | Travel_Frequently | No | 205 | 75.1% |
| 3 | Travel_Frequently | Yes | 68 | 24.9% |
| 4 | Travel_Rarely | No | 891 | 84.9% |
| 5 | Travel_Rarely | Yes | 158 | 15.1% |
import pandas as pd
import plotly.express as px
def create_bivariate_stacked_bar_chart(df, row, column, title, xaxis_title, yaxis_title):
    """Show a 100%-stacked bar chart of within-`row` percentages for `column`.

    Parameters
    ----------
    df : pd.DataFrame
    row, column : str
        Columns to cross-tabulate; percentages are within each `row` group.
    title, xaxis_title, yaxis_title : str
        Chart labels.

    Returns
    -------
    pd.DataFrame
        Columns [row, column, 'Counts', 'Percentage' (float), 'Percent' (str)].
    """
    def _bivariate_table(df, row, column):
        # Pair counts and within-row-group percentages.
        counts = df.groupby([row, column]).size()
        # transform('sum') replaces the deprecated groupby(...).apply(...)
        # pattern that raised a FutureWarning on every call.
        pct = (counts / counts.groupby(level=0).transform('sum') * 100).round(1)
        table = counts.reset_index()
        table.columns = [row, column, 'Counts']
        table['Percentage'] = pct.values
        table['Percent'] = table['Percentage'].astype(str) + '%'
        return table
    table = _bivariate_table(df, row, column)
    fig = px.bar(table, x=row, y='Percentage', color=column, barmode='stack', text=table['Percentage'])
    fig.update_layout(title=title, xaxis_title=xaxis_title, yaxis_title=yaxis_title, width=800, height=600)
    fig.show()
    return table
create_bivariate_stacked_bar_chart(df,'BusinessTravel','Attrition','Business Travel vs Attrition', 'Business Travel', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| BusinessTravel | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Non-Travel | No | 137 | 92.6 | 92.6% |
| 1 | Non-Travel | Yes | 11 | 7.4 | 7.4% |
| 2 | Travel_Frequently | No | 205 | 75.1 | 75.1% |
| 3 | Travel_Frequently | Yes | 68 | 24.9 | 24.9% |
| 4 | Travel_Rarely | No | 891 | 84.9 | 84.9% |
| 5 | Travel_Rarely | Yes | 158 | 15.1 | 15.1% |
create_bivariate_stacked_bar_chart(df,'Age_group','Attrition','Age vs Attrition', 'Age', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| Age_group | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | 19-24 | No | 40 | 56.3 | 56.3% |
| 1 | 19-24 | Yes | 31 | 43.7 | 43.7% |
| 2 | 25-34 | No | 389 | 78.4 | 78.4% |
| 3 | 25-34 | Yes | 107 | 21.6 | 21.6% |
| 4 | 35-44 | No | 502 | 89.8 | 89.8% |
| 5 | 35-44 | Yes | 57 | 10.2 | 10.2% |
| 6 | 45-54 | No | 226 | 87.9 | 87.9% |
| 7 | 45-54 | Yes | 31 | 12.1 | 12.1% |
| 8 | 55 + | No | 76 | 87.4 | 87.4% |
| 9 | 55 + | Yes | 11 | 12.6 | 12.6% |
create_bivariate_stacked_bar_chart(df,'DailyRate_Group','Attrition','Daily Rate vs Attrition', 'Daily Rate', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| DailyRate_Group | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | <200 | No | 93 | 87.7 | 87.7% |
| 1 | <200 | Yes | 13 | 12.3 | 12.3% |
| 2 | 201-499 | No | 232 | 78.6 | 78.6% |
| 3 | 201-499 | Yes | 63 | 21.4 | 21.4% |
| 4 | 500-897 | No | 366 | 83.0 | 83.0% |
| 5 | 500-897 | Yes | 75 | 17.0 | 17.0% |
| 6 | 898-1296 | No | 353 | 86.9 | 86.9% |
| 7 | 898-1296 | Yes | 53 | 13.1 | 13.1% |
| 8 | 1297-1693 | No | 188 | 85.1 | 85.1% |
| 9 | 1297-1693 | Yes | 33 | 14.9 | 14.9% |
| 10 | >1694 | No | 1 | 100.0 | 100.0% |
| 11 | >1694 | Yes | 0 | 0.0 | 0.0% |
create_bivariate_stacked_bar_chart(df,'Distance_Group','Attrition','Distance From Home vs Attrition', 'Distance From Home', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| Distance_Group | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | <5 KM | No | 539 | 86.4 | 86.4% |
| 1 | <5 KM | Yes | 85 | 13.6 | 13.6% |
| 2 | 6-10 KM | No | 346 | 85.2 | 85.2% |
| 3 | 6-10 KM | Yes | 60 | 14.8 | 14.8% |
| 4 | 11-15 KM | No | 89 | 78.1 | 78.1% |
| 5 | 11-15 KM | Yes | 25 | 21.9 | 21.9% |
| 6 | 16-20 KM | No | 101 | 82.1 | 82.1% |
| 7 | 16-20 KM | Yes | 22 | 17.9 | 17.9% |
| 8 | 21-25 KM | No | 84 | 72.4 | 72.4% |
| 9 | 21-25 KM | Yes | 32 | 27.6 | 27.6% |
| 10 | 25 + KM | No | 74 | 85.1 | 85.1% |
| 11 | 25 + KM | Yes | 13 | 14.9 | 14.9% |
create_bivariate_stacked_bar_chart(df,'MonthlyIncome_Group','Attrition','Monthly Income vs Attrition', 'Monthly Income', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| MonthlyIncome_Group | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | < 6000 Monthly Income | No | 727 | 80.2 | 80.2% |
| 1 | < 6000 Monthly Income | Yes | 179 | 19.8 | 19.8% |
| 2 | 6001 - 10999 Monthly Income | No | 303 | 86.8 | 86.8% |
| 3 | 6001 - 10999 Monthly Income | Yes | 46 | 13.2 | 13.2% |
| 4 | 11001-16000 Monthly Income | No | 82 | 92.1 | 92.1% |
| 5 | 11001-16000 Monthly Income | Yes | 7 | 7.9 | 7.9% |
| 6 | 16001-21000 Monthly Income | No | 121 | 96.0 | 96.0% |
| 7 | 16001-21000 Monthly Income | Yes | 5 | 4.0 | 4.0% |
| 8 | 21000 + Monthly Income | No | 0 | NaN | nan% |
| 9 | 21000 + Monthly Income | Yes | 0 | NaN | nan% |
create_bivariate_stacked_bar_chart(df,'MonthlyRate_Group','Attrition','Monthly Rate vs Attrition', 'Monthly Rate', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| MonthlyRate_Group | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | < 10000 Monthly Rate | No | 414 | 85.4 | 85.4% |
| 1 | < 10000 Monthly Rate | Yes | 71 | 14.6 | 14.6% |
| 2 | 10001 - 18000 Monthly Rate | No | 393 | 82.9 | 82.9% |
| 3 | 10001 - 18000 Monthly Rate | Yes | 81 | 17.1 | 17.1% |
| 4 | 18001-26000 Monthly Rate | No | 387 | 83.9 | 83.9% |
| 5 | 18001-26000 Monthly Rate | Yes | 74 | 16.1 | 16.1% |
| 6 | 26001-34000 Monthly Rate | No | 39 | 79.6 | 79.6% |
| 7 | 26001-34000 Monthly Rate | Yes | 10 | 20.4 | 20.4% |
| 8 | 34001 - 58000 Monthly Rate | No | 0 | 0.0 | 0.0% |
| 9 | 34001 - 58000 Monthly Rate | Yes | 1 | 100.0 | 100.0% |
| 10 | 58001 + Monthly Rate | No | 0 | NaN | nan% |
| 11 | 58001 + Monthly Rate | Yes | 0 | NaN | nan% |
create_bivariate_stacked_bar_chart(df,'NumCompaniesWorked_Group','Attrition','Num Companies Worked vs Attrition', 'Num Companies Worked ', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| NumCompaniesWorked_Group | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | < 1 Company | No | 591 | 83.1 | 83.1% |
| 1 | < 1 Company | Yes | 120 | 16.9 | 16.9% |
| 2 | 2-3 Company | No | 282 | 89.2 | 89.2% |
| 3 | 2-3 Company | Yes | 34 | 10.8 | 10.8% |
| 4 | 4-5 Company | No | 168 | 83.6 | 83.6% |
| 5 | 4-5 Company | Yes | 33 | 16.4 | 16.4% |
| 6 | 6-7 Company | No | 109 | 76.8 | 76.8% |
| 7 | 6-7 Company | Yes | 33 | 23.2 | 23.2% |
| 8 | 8-9 Company | No | 83 | 83.0 | 83.0% |
| 9 | 8-9 Company | Yes | 17 | 17.0 | 17.0% |
| 10 | More than 9 Company | No | 0 | NaN | nan% |
| 11 | More than 9 Company | Yes | 0 | NaN | nan% |
create_bivariate_stacked_bar_chart(df,'PercentSalaryHike_Group','Attrition','Salary Hike vs Attrition', 'Salary Hike', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| PercentSalaryHike_Group | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | < 13 percentage % | No | 503 | 82.5 | 82.5% |
| 1 | < 13 percentage % | Yes | 107 | 17.5 | 17.5% |
| 2 | 14-16 percentage % | No | 335 | 85.2 | 85.2% |
| 3 | 14-16 percentage % | Yes | 58 | 14.8 | 14.8% |
| 4 | 17-19 percentage % | No | 209 | 85.7 | 85.7% |
| 5 | 17-19 percentage % | Yes | 35 | 14.3 | 14.3% |
| 6 | 20-22 percentage % | No | 135 | 84.9 | 84.9% |
| 7 | 20-22 percentage % | Yes | 24 | 15.1 | 15.1% |
| 8 | 22 + percentage % | No | 51 | 79.7 | 79.7% |
| 9 | 22 + percentage % | Yes | 13 | 20.3 | 20.3% |
create_bivariate_stacked_bar_chart(df,'Department','Attrition','Departmentvs Attrition', 'Department', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| Department | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Human Resources | No | 50 | 80.6 | 80.6% |
| 1 | Human Resources | Yes | 12 | 19.4 | 19.4% |
| 2 | Research & Development | No | 832 | 86.1 | 86.1% |
| 3 | Research & Development | Yes | 134 | 13.9 | 13.9% |
| 4 | Sales | No | 351 | 79.4 | 79.4% |
| 5 | Sales | Yes | 91 | 20.6 | 20.6% |
create_bivariate_stacked_bar_chart(df,'Education','Attrition','Education Attrition', 'Education', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| Education | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Bachelor | No | 482 | 82.7 | 82.7% |
| 1 | Bachelor | Yes | 101 | 17.3 | 17.3% |
| 2 | Below College | No | 139 | 82.2 | 82.2% |
| 3 | Below College | Yes | 30 | 17.8 | 17.8% |
| 4 | College | No | 235 | 84.2 | 84.2% |
| 5 | College | Yes | 44 | 15.8 | 15.8% |
| 6 | Doctor | No | 43 | 89.6 | 89.6% |
| 7 | Doctor | Yes | 5 | 10.4 | 10.4% |
| 8 | Master | No | 334 | 85.4 | 85.4% |
| 9 | Master | Yes | 57 | 14.6 | 14.6% |
create_bivariate_stacked_bar_chart(df,'EducationField','Attrition','Education Field vs Attrition', 'Education Field', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| EducationField | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Human Resources | No | 20 | 74.1 | 74.1% |
| 1 | Human Resources | Yes | 7 | 25.9 | 25.9% |
| 2 | Life Sciences | No | 522 | 85.2 | 85.2% |
| 3 | Life Sciences | Yes | 91 | 14.8 | 14.8% |
| 4 | Marketing | No | 123 | 78.3 | 78.3% |
| 5 | Marketing | Yes | 34 | 21.7 | 21.7% |
| 6 | Medical | No | 397 | 86.5 | 86.5% |
| 7 | Medical | Yes | 62 | 13.5 | 13.5% |
| 8 | Other | No | 71 | 86.6 | 86.6% |
| 9 | Other | Yes | 11 | 13.4 | 13.4% |
| 10 | Technical Degree | No | 100 | 75.8 | 75.8% |
| 11 | Technical Degree | Yes | 32 | 24.2 | 24.2% |
create_bivariate_stacked_bar_chart(df,'EnvironmentSatisfaction','Attrition','Environment Satisfaction vs Attrition', 'Environment Satisfaction', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| EnvironmentSatisfaction | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | High | No | 403 | 86.1 | 86.1% |
| 1 | High | Yes | 65 | 13.9 | 13.9% |
| 2 | Low | No | 208 | 74.6 | 74.6% |
| 3 | Low | Yes | 71 | 25.4 | 25.4% |
| 4 | Medium | No | 240 | 85.1 | 85.1% |
| 5 | Medium | Yes | 42 | 14.9 | 14.9% |
| 6 | Ver High | No | 382 | 86.6 | 86.6% |
| 7 | Ver High | Yes | 59 | 13.4 | 13.4% |
create_bivariate_stacked_bar_chart(df,'Gender','Attrition','Gender vs Attrition', 'Gender', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| Gender | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Female | No | 496 | 85.1 | 85.1% |
| 1 | Female | Yes | 87 | 14.9 | 14.9% |
| 2 | Male | No | 737 | 83.1 | 83.1% |
| 3 | Male | Yes | 150 | 16.9 | 16.9% |
create_bivariate_stacked_bar_chart(df,'JobInvolvement','Attrition','JobInvolvement vs Attrition', 'JobInvolvement', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| JobInvolvement | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | High | No | 746 | 85.5 | 85.5% |
| 1 | High | Yes | 127 | 14.5 | 14.5% |
| 2 | Low | No | 53 | 65.4 | 65.4% |
| 3 | Low | Yes | 28 | 34.6 | 34.6% |
| 4 | Medium | No | 303 | 81.5 | 81.5% |
| 5 | Medium | Yes | 69 | 18.5 | 18.5% |
| 6 | Very High | No | 131 | 91.0 | 91.0% |
| 7 | Very High | Yes | 13 | 9.0 | 9.0% |
create_bivariate_stacked_bar_chart(df,'JobRole','Attrition','JobRole vs Attrition', 'JobRole', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| JobRole | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Healthcare Representative | No | 121 | 93.1 | 93.1% |
| 1 | Healthcare Representative | Yes | 9 | 6.9 | 6.9% |
| 2 | Human Resources | No | 40 | 76.9 | 76.9% |
| 3 | Human Resources | Yes | 12 | 23.1 | 23.1% |
| 4 | Laboratory Technician | No | 194 | 75.8 | 75.8% |
| 5 | Laboratory Technician | Yes | 62 | 24.2 | 24.2% |
| 6 | Manager | No | 97 | 95.1 | 95.1% |
| 7 | Manager | Yes | 5 | 4.9 | 4.9% |
| 8 | Manufacturing Director | No | 134 | 93.1 | 93.1% |
| 9 | Manufacturing Director | Yes | 10 | 6.9 | 6.9% |
| 10 | Research Director | No | 77 | 97.5 | 97.5% |
| 11 | Research Director | Yes | 2 | 2.5 | 2.5% |
| 12 | Research Scientist | No | 245 | 84.5 | 84.5% |
| 13 | Research Scientist | Yes | 45 | 15.5 | 15.5% |
| 14 | Sales Executive | No | 276 | 82.4 | 82.4% |
| 15 | Sales Executive | Yes | 59 | 17.6 | 17.6% |
| 16 | Sales Representative | No | 49 | 59.8 | 59.8% |
| 17 | Sales Representative | Yes | 33 | 40.2 | 40.2% |
create_bivariate_stacked_bar_chart(df,'MaritalStatus','Attrition','MaritalStatus vs Attrition', 'MaritalStatus', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| MaritalStatus | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Divorced | No | 293 | 89.9 | 89.9% |
| 1 | Divorced | Yes | 33 | 10.1 | 10.1% |
| 2 | Married | No | 593 | 87.2 | 87.2% |
| 3 | Married | Yes | 87 | 12.8 | 12.8% |
| 4 | Single | No | 347 | 74.8 | 74.8% |
| 5 | Single | Yes | 117 | 25.2 | 25.2% |
create_bivariate_stacked_bar_chart(df,'PerformanceRating','Attrition','PerformanceRating vs Attrition', 'PerformanceRating', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| PerformanceRating | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Excellent | No | 1048 | 84.0 | 84.0% |
| 1 | Excellent | Yes | 200 | 16.0 | 16.0% |
| 2 | Outstanding | No | 185 | 83.3 | 83.3% |
| 3 | Outstanding | Yes | 37 | 16.7 | 16.7% |
create_bivariate_stacked_bar_chart(df,'RelationshipSatisfaction','Attrition','Relationship Satisfaction vs Attrition', 'Relationship Satisfaction', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| RelationshipSatisfaction | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | High | No | 395 | 84.6 | 84.6% |
| 1 | High | Yes | 72 | 15.4 | 15.4% |
| 2 | Low | No | 217 | 79.2 | 79.2% |
| 3 | Low | Yes | 57 | 20.8 | 20.8% |
| 4 | Medium | No | 256 | 85.0 | 85.0% |
| 5 | Medium | Yes | 45 | 15.0 | 15.0% |
| 6 | Very High | No | 365 | 85.3 | 85.3% |
| 7 | Very High | Yes | 63 | 14.7 | 14.7% |
create_bivariate_stacked_bar_chart(df,'WorkLifeBalance','Attrition','WorkLife Balance vs Attrition', 'WorkLife Balance', 'Pecentage %')
C:\Users\sasikumarchennova\AppData\Local\Temp\ipykernel_19564\4069780291.py:7: FutureWarning: Not prepending group keys to the result index of transform-like apply. In the future, the group keys will be included in the index, regardless of whether the applied function returns a like-indexed object. To preserve the previous behavior, use >>> .groupby(..., group_keys=False) To adopt the future behavior and silence this warning, use >>> .groupby(..., group_keys=True)
| WorkLifeBalance | Attrition | Counts | Percentage | Percent | |
|---|---|---|---|---|---|
| 0 | Bad | No | 55 | 68.8 | 68.8% |
| 1 | Bad | Yes | 25 | 31.2 | 31.2% |
| 2 | Best | No | 126 | 82.4 | 82.4% |
| 3 | Best | Yes | 27 | 17.6 | 17.6% |
| 4 | Better | No | 769 | 85.8 | 85.8% |
| 5 | Better | Yes | 127 | 14.2 | 14.2% |
| 6 | Good | No | 283 | 83.0 | 83.0% |
| 7 | Good | Yes | 58 | 17.0 | 17.0% |